# Read the wine-quality CSV and take a first look at its contents.
wine <- read.csv("wine-quality-white-and-red.csv")
head(wine)
str(wine)

# Store the wine type as a factor, then re-check the structure and size.
wine[["type"]] <- as.factor(wine[["type"]])
str(wine)
nrow(wine)
ncol(wine)

# Remove exact duplicate rows (the first occurrence of each row is kept).
wine <- unique(wine)
nrow(wine)

# Shuffle the rows with a fixed seed so the order is reproducible.
set.seed(42)
rows <- sample(nrow(wine))
wine <- wine[rows, ]
nrow(wine)
library(ggplot2)
# Draw one scatter plot per pair of columns, coloured by quality. The last
# column of `wine` is left out of the pairs (presumably it is `quality`
# itself -- confirm against the CSV layout).
plot_cols <- head(colnames(wine), -1)

# seq_len() keeps the loop empty (instead of counting backwards) when there
# are fewer than two columns to pair up.
for (xx in seq_len(max(length(plot_cols) - 1, 0))) {
  for (yy in (xx + 1):length(plot_cols)) {
    # Progress output: positions of the two columns being plotted.
    print(xx)
    print(yy)

    x_name <- plot_cols[xx]
    y_name <- plot_cols[yy]

    # Refer to columns through the `.data` pronoun rather than indexing the
    # data frame inside aes(); the mapping then stays tied to the plot's data.
    p <- ggplot(
      wine,
      aes(x = .data[[x_name]], y = .data[[y_name]], color = quality)
    ) +
      geom_point() +
      labs(x = x_name, y = y_name, title = paste(y_name, "vs", x_name))
    print(p)
  }
}

# k-nearest-neighbours classification of wines into "high" / "low" quality.
set.seed(1000)

# Binary target: quality above 5 is "high", everything else is "low".
qualityvariable <- ifelse(wine$quality > 5, "high", "low")
wineknn <- data.frame(wine, qualityvariable)

# Drop the raw quality score and the wine type by name instead of by
# position, so the selection does not depend on the column order of the CSV.
wineknn <- wineknn[, !(names(wineknn) %in% c("quality", "type"))]
head(wineknn)

# Standardise every predictor, i.e. every column except the label.
# NOTE(review): centring/scaling statistics are computed on all rows before
# the train/test split, so test rows influence the scaling of training rows.
knn_feature_cols <- setdiff(names(wineknn), "qualityvariable")
scaleddata <- as.data.frame(
  scale(wineknn[knn_feature_cols], center = TRUE, scale = TRUE)
)
head(scaleddata)

# Randomly assign each row to training (1) or test (2), roughly 67% / 33%.
wine_sample <- sample(2, nrow(scaleddata), replace = TRUE, prob = c(0.67, 0.33))
wine_training <- scaleddata[wine_sample == 1, ]
head(wine_training)
wine_test <- scaleddata[wine_sample == 2, ]
nrow(wine_test)
nrow(wine)

# Labels that go with the training and test rows.
wine.trainLabels <- wineknn[wine_sample == 1, "qualityvariable"]
wine.testLabels <- wineknn[wine_sample == 2, "qualityvariable"]
length(wine.testLabels)

library(class)
library(caret)
nrow(wine_training)
length(wine.trainLabels)

# Classify the test rows using the 19 nearest training neighbours.
wine_model_pred <- knn(
  train = wine_training,
  test = wine_test,
  cl = wine.trainLabels,
  k = 19
)

# Load with library(), as the other packages in this script are, instead of
# relying on a loadPkg() helper that is not defined in this file.
library(gmodels)
crosst <- CrossTable(wine.testLabels, wine_model_pred, prop.chisq = FALSE)
crosst

# Confusion matrix and overall accuracy (caret).
cm <- confusionMatrix(wine_model_pred, reference = as.factor(wine.testLabels))

print(paste("Total Accuracy = ", cm$overall["Accuracy"]))
#' Fit a k-nearest-neighbours classifier and report its validation accuracy.
#'
#' @param k Number of neighbours passed to `knn()`.
#' @param train_set Training cases (predictor columns only).
#' @param val_set Validation cases to classify.
#' @param train_class True classes of the rows of `train_set`.
#' @param val_class True classes of the rows of `val_set`.
#' @return A 1 x 2 matrix with columns `k` and `accuracy`, where `accuracy`
#'   is the share of validation cases whose predicted class equals
#'   `val_class`.
chooseK <- function(k, train_set, val_set, train_class, val_class) {
  # Reset the RNG before every fit so repeated calls give the same result
  # (knn() is documented to break ties at random).
  set.seed(1)
  class_knn <- knn(
    train = train_set,  # training set cases
    test = val_set,     # cases to classify
    cl = train_class,   # true classes of the training cases
    k = k               # number of neighbours considered
  )

  # Compare predictions and truth element by element. Reading the diagonal of
  # table(class_knn, val_class) is only right when the table is square; it
  # gives a wrong answer when, for example, the validation labels contain
  # just one of the classes. Pairs with a missing label are ignored, as
  # table() would ignore them.
  accu <- mean(as.character(class_knn) == as.character(val_class), na.rm = TRUE)
  cbind(k = k, accuracy = accu)
}

# Evaluate chooseK() for every odd k from 1 to 29.
# vapply() is used instead of sapply() so that the shape of each result
# (exactly two numbers: k and its accuracy) is checked rather than guessed.
k_values <- seq(1, 30, by = 2)
knn_different_k <- vapply(
  k_values,
  function(x) {
    # chooseK() returns a 1 x 2 matrix; flatten it to c(k, accuracy).
    as.numeric(chooseK(
      x,
      train_set = wine_training,
      val_set = wine_test,
      train_class = wine.trainLabels,
      val_class = wine.testLabels
    ))
  },
  numeric(2)
)

# Reformat the results for plotting: row 1 holds k, row 2 holds the accuracy.
str(knn_different_k)
knn_different_k <- data.frame(
  k = knn_different_k[1, ],
  accuracy = knn_different_k[2, ]
)
knn_different_k

# Plot accuracy vs. k.
# install.packages("ggplot2")
# Load with library(), as elsewhere in this script, instead of relying on a
# loadPkg() helper that is not defined in this file.
library(ggplot2)

ggplot(knn_different_k, aes(x = k, y = accuracy)) +
  geom_line(color = "orange", linewidth = 1.5) +
  geom_point(size = 3) +
  labs(title = "accuracy vs k")

## DECISION TREES

set.seed(1000)

# Binary target: quality above 5 is "high", everything else is "low".
qualityvariable <- ifelse(wine$quality > 5, "high", "low")
winedc <- data.frame(wine, qualityvariable)
table(winedc$qualityvariable)

# Drop the raw quality score and the wine type by name instead of by
# position, so the selection does not depend on the column order of the CSV.
winedc <- winedc[, !(names(winedc) %in% c("quality", "type"))]

# Randomly assign each row to training (1) or test (2), roughly 67% / 33%.
wine_sample_for_dc <- sample(2, nrow(winedc), replace = TRUE, prob = c(0.67, 0.33))
wine_training_for_dc <- winedc[wine_sample_for_dc == 1, ]
nrow(wine_training_for_dc)

wine_test_for_dc <- winedc[wine_sample_for_dc == 2, ]
nrow(wine_test_for_dc)

library(rpart)

# Classification tree on all remaining predictors, at most 5 levels deep.
qualityfit <- rpart(
  qualityvariable ~ .,
  data = wine_training_for_dc,
  method = "class",
  maxdepth = 5
)

#qualityfit <- rpart(qualityvariable ~ ., data=wine_training_for_dc, method="class",minsplit = 5, maxdepth = 5, minbucket = 5 )
summary(qualityfit)

# Title corrected: the previous text ("... for Kyphosis") named a different
# data set than the one being modelled here.
plot(qualityfit, uniform = TRUE, main = "Classification Tree for Wine Quality")
text(qualityfit, use.n = TRUE, all = TRUE, cex = .8)

# Load with library(), as elsewhere in this script, instead of relying on a
# loadPkg() helper that is not defined in this file.
library(rpart.plot)
rpart.plot(qualityfit)

# Predict on the test predictors only. The label column is removed by name;
# a positional index past the last column would not remove anything.
dc_predictor_cols <- setdiff(names(wine_test_for_dc), "qualityvariable")
prediction <- predict(
  qualityfit,
  wine_test_for_dc[, dc_predictor_cols],
  type = "class"
)
print(prediction)
library(caret)

# Confusion matrix of predicted vs. actual class on the test split.
cm <- confusionMatrix(prediction, as.factor(wine_test_for_dc$qualityvariable))
print('Overall: ')
cm$overall
print('Class: ')
cm$byClass
# Load with library(), as elsewhere in this script, instead of relying on a
# loadPkg() helper that is not defined in this file.
library(rpart)
library(caret)

# Fit one classification tree per maximum depth (2 to 10) and collect the
# test-set accuracy together with caret's per-class metrics for each depth.
depths <- 2:10

# Test predictors (label column removed by name) and the true test classes
# do not change between iterations, so compute them once.
dc_test_predictors <- wine_test_for_dc[
  , setdiff(names(wine_test_for_dc), "qualityvariable")
]
dc_test_truth <- as.factor(wine_test_for_dc$qualityvariable)

# Collect one result row per depth in a preallocated list and bind them once
# at the end, rather than growing a data frame with rbind() in the loop.
depth_rows <- vector("list", length(depths))

for (i in seq_along(depths)) {
  deep <- depths[i]
  qualityfit_model <- rpart(
    qualityvariable ~ .,
    data = wine_training_for_dc,
    method = "class",
    control = list(maxdepth = deep)
  )
  predicted <- predict(qualityfit_model, dc_test_predictors, type = "class")
  cm <- confusionMatrix(predicted, dc_test_truth) # from caret library

  cmaccu <- cm$overall["Accuracy"]

  # One row: depth and accuracy first, then the transposed per-class metrics.
  cmt <- data.frame(Depth = deep, Accuracy = cmaccu, row.names = NULL)
  cmt <- cbind(cmt, data.frame(t(cm$byClass)))
  depth_rows[[i]] <- cmt
}

confusionMatrixResultDf <- do.call(rbind, depth_rows)

# Show the table after the loop: a bare object name inside a for loop is
# evaluated but never printed.
confusionMatrixResultDf
# Load with library(), as elsewhere in this script, instead of relying on a
# loadPkg() helper that is not defined in this file.
library(rpart.plot)
rpart.plot(qualityfit)

# Complexity-parameter plot for the fitted tree.
plotcp(qualityfit)

# NOTE(review): if qualityfit was grown with rpart's default complexity
# parameter, a pruning threshold of 0.003 may be below it and leave the tree
# unchanged -- confirm against the cp table of the fit.
prqualityfit <- prune(qualityfit, cp = 0.003)

# Compute the accuracy of the pruned tree on the test split. The label column
# is removed by name, and predictions are compared as character strings so
# the comparison does not fail when the two factors have different level sets.
pred <- predict(
  prqualityfit,
  wine_test_for_dc[, setdiff(names(wine_test_for_dc), "qualityvariable")],
  type = "class"
)
accuracy_prun <- mean(
  as.character(pred) == as.character(wine_test_for_dc$qualityvariable)
)
data.frame(accuracy_prun)
rpart.plot(prqualityfit)

## LOGISTIC REGRESSION

set.seed(1000)

# Binary target coded 1 (quality above 5) or 0, as glm() with a binomial
# family expects.
qualityvariable <- ifelse(wine$quality > 5, 1, 0)
winelogit <- data.frame(wine, qualityvariable)

# Drop the wine type and the raw quality score by name instead of by
# position, so the selection does not depend on the column order of the CSV.
winelogit <- winelogit[, !(names(winelogit) %in% c("type", "quality"))]
head(winelogit)

# Randomly assign each row to training (1) or test (2), roughly 67% / 33%.
wine_sample <- sample(2, nrow(winelogit), replace = TRUE, prob = c(0.67, 0.33))
wine_training <- winelogit[wine_sample == 1, ]
head(wine_training)
wine_test <- winelogit[wine_sample == 2, ]
nrow(wine_test)
nrow(wine_training)
head(winelogit)

# Correlation between the predictors only (label column removed by name).
corrlogit <- cor(
  wine_training[, setdiff(names(wine_training), "qualityvariable")]
)
corrlogit

# Load with library(), as elsewhere in this script, instead of relying on a
# loadPkg() helper that is not defined in this file.
library(corrplot)
corrplot(corrlogit, type = "lower", method = "square")

logitmodel <- glm(
  qualityvariable ~ fixed.acidity + volatile.acidity + citric.acid +
    residual.sugar + free.sulfur.dioxide + pH + sulphates + alcohol,
  data = wine_training,
  family = "binomial"
)
summary(logitmodel)

# Exponentiated coefficients (odds ratios).
expcoeff <- exp(coef(logitmodel))
expcoeff
head(wine_training)

# predict() looks the model's predictors up by name, so the test frame can be
# passed as it is; no positional column selection is needed.
fitted.results <- predict(logitmodel, newdata = wine_test, type = "response")

# Classify as 1 when the predicted probability exceeds 0.5.
fitted.results_val <- ifelse(fitted.results > 0.5, 1, 0)
misClasificError <- mean(fitted.results_val != wine_test$qualityvariable)
print(paste('Accuracy', 1 - misClasificError))

library(car)
vif(logitmodel)
length(fitted.results)
length(wine_test$qualityvariable)

# Give both factors the same explicit levels so the call still works when
# the predictions (or the test labels) happen to contain only one class.
# NOTE(review): caret treats the first level ("0") as the positive class by
# default; pass positive = "1" if the per-class metrics should describe the
# high-quality wines.
confusionMatrix(
  factor(fitted.results_val, levels = c(0, 1)),
  factor(wine_test$qualityvariable, levels = c(0, 1))
)

library(pROC)

# Keep the predicted probability next to the true label for the ROC curve.
wine_test$prob <- fitted.results
h <- roc(qualityvariable ~ prob, data = wine_test)
auc(h) # area-under-curve prefer 0.8 or higher.
plot(h)

## FEATURE SELECTION FOR LOGISTIC REGRESSION

# Load with library(), as elsewhere in this script, instead of relying on a
# loadPkg() helper that is not defined in this file.
library(leaps)

# Exhaustive best-subset search over the predictors of the logistic model,
# keeping the single best model of each size.
reg.leaps <- regsubsets(
  qualityvariable ~ fixed.acidity + volatile.acidity + citric.acid +
    residual.sugar + free.sulfur.dioxide + pH + sulphates + alcohol,
  data = wine_training,
  nbest = 1,
  method = "exhaustive"
)
plot(reg.leaps, scale = "adjr2", main = "Adjusted R^2")

plot(reg.leaps, scale = "bic", main = "BIC")

plot(reg.leaps, scale = "Cp", main = "Cp")

library(bestglm)
head(wine_test)

# Drop the 7th column, as before.
wine_test_1 <- wine_test[, c(-7)]

# bestglm() uses the LAST column of Xy as the response. Once the predicted
# probability `prob` has been appended to wine_test, it becomes the last
# column and would be modelled instead of the label. Remove `prob` (if it is
# there) and move qualityvariable to the end explicitly.
bestglm_predictors <- setdiff(names(wine_test_1), c("qualityvariable", "prob"))
wine_test_1 <- wine_test_1[, c(bestglm_predictors, "qualityvariable")]
head(wine_test_1)

# The response is coded 0/1, so fit binomial (logistic) models rather than
# bestglm's default gaussian family.
# NOTE(review): the search runs on the test split; selecting features on
# held-out data leaks information -- consider wine_training instead.
res.bestglm <- bestglm(
  Xy = wine_test_1,
  family = binomial,
  IC = "AIC",           # information criterion used to rank the models
  method = "exhaustive"
)
summary(res.bestglm)
res.bestglm$BestModels